library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Data from the speech features
pd_speech_features <- as.data.frame(read_excel("~/GitHub/FCA/Data/pd_speech_features.xlsx",sheet = "pd_speech_features", range = "A2:ACB758"))
Each subject had three repeated observations. Here I’ll use the average of the three experiments per subject.
rep1Parkison <- subset(pd_speech_features,RID==1)
rownames(rep1Parkison) <- rep1Parkison$id
rep1Parkison$id <- NULL
rep1Parkison$RID <- NULL
rep1Parkison[,1:ncol(rep1Parkison)] <- sapply(rep1Parkison,as.numeric)
rep2Parkison <- subset(pd_speech_features,RID==2)
rownames(rep2Parkison) <- rep2Parkison$id
rep2Parkison$id <- NULL
rep2Parkison$RID <- NULL
rep2Parkison[,1:ncol(rep2Parkison)] <- sapply(rep2Parkison,as.numeric)
rep3Parkison <- subset(pd_speech_features,RID==3)
rownames(rep3Parkison) <- rep3Parkison$id
rep3Parkison$id <- NULL
rep3Parkison$RID <- NULL
rep3Parkison[,1:ncol(rep3Parkison)] <- sapply(rep3Parkison,as.numeric)
whof <- !(colnames(rep1Parkison) %in% c("gender","class"));
avgParkison <- rep1Parkison;
avgParkison[,whof] <- (rep1Parkison[,whof] + rep2Parkison[,whof] + rep3Parkison[,whof])/3
signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(avgParkison) %in% c("gender","class"));
avgParkison[,whof] <- signedlog(avgParkison[,whof])
studyName <- "Parkinsons"
dataframe <- avgParkison
outcome <- "class"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 252 | 753 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 64 | 188 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1500
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9999953
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 744 , Uni p: 0.01697637 , Uncorrelated Base: 178 , Outcome-Driven Size: 0 , Base Size: 178
#>
#>
1 <R=1.000,r=0.975,N= 358>, Top: 82( 42 )[ 1 : 82 Fa= 82 : 0.975 ]( 82 , 232 , 0 ),<|>Tot Used: 314 , Added: 232 , Zero Std: 0 , Max Cor: 1.000
#>
2 <R=1.000,r=0.975,N= 358>, Top: 23( 7 )[ 1 : 23 Fa= 105 : 0.975 ]( 23 , 46 , 82 ),<|>Tot Used: 347 , Added: 46 , Zero Std: 0 , Max Cor: 0.996
#>
3 <R=0.996,r=0.973,N= 358>, Top: 11( 4 )[ 1 : 11 Fa= 111 : 0.973 ]( 11 , 14 , 105 ),<|>Tot Used: 361 , Added: 14 , Zero Std: 0 , Max Cor: 0.973
#>
4 <R=0.973,r=0.937,N= 218>, Top: 72( 3 )[ 1 : 72 Fa= 145 : 0.937 ]( 71 , 103 , 111 ),<|>Tot Used: 429 , Added: 103 , Zero Std: 0 , Max Cor: 0.981
#>
5 <R=0.981,r=0.941,N= 218>, Top: 15( 2 )[ 1 : 15 Fa= 151 : 0.941 ]( 15 , 19 , 145 ),<|>Tot Used: 438 , Added: 19 , Zero Std: 0 , Max Cor: 0.955
#>
6 <R=0.955,r=0.877,N= 167>, Top: 54( 3 )[ 1 : 54 Fa= 166 : 0.877 ]( 54 , 85 , 151 ),<|>Tot Used: 475 , Added: 85 , Zero Std: 0 , Max Cor: 0.987
#>
7 <R=0.987,r=0.894,N= 167>, Top: 9( 1 )[ 1 : 9 Fa= 170 : 0.894 ]( 9 , 9 , 166 ),<|>Tot Used: 479 , Added: 9 , Zero Std: 0 , Max Cor: 0.893
#>
8 <R=0.893,r=0.800,N= 197>, Top: 62( 6 )[ 1 : 62 Fa= 185 : 0.824 ]( 59 , 108 , 170 ),<|>Tot Used: 516 , Added: 108 , Zero Std: 0 , Max Cor: 0.984
#>
9 <R=0.984,r=0.842,N= 197>, Top: 14( 1 )[ 1 : 14 Fa= 190 : 0.842 ]( 14 , 15 , 185 ),<|>Tot Used: 520 , Added: 15 , Zero Std: 0 , Max Cor: 0.853
#>
10 <R=0.853,r=0.800,N= 24>, Top: 11( 1 )[ 1 : 11 Fa= 192 : 0.800 ]( 11 , 13 , 190 ),<|>Tot Used: 523 , Added: 13 , Zero Std: 0 , Max Cor: 0.869
#>
11 <R=0.869,r=0.800,N= 24>, Top: 1( 1 )[ 1 : 1 Fa= 193 : 0.800 ]( 1 , 1 , 192 ),<|>Tot Used: 523 , Added: 1 , Zero Std: 0 , Max Cor: 0.797
#>
12 <R=0.797,r=0.800,N= 0>
#>
[ 12 ], 0.7971068 Decor Dimension: 523 Nused: 523 . Cor to Base: 229 , ABase: 13 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
718
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
302
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.83
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
3.61
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPSTM <- attr(DEdataframe,"UPSTM")
gplots::heatmap.2(1.0*(abs(UPSTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.7971068
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])
datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : std_MFCC_2nd_coef 200 : app_entropy_log_3_coef 300 :
app_LT_TKEO_mean_7_coef 400 : tqwt_entropy_log_dec_15 500 :
tqwt_medianValue_dec_7
600 : tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : La_std_MFCC_2nd_coef 200 : La_app_entropy_log_3_coef 300 :
La_app_LT_TKEO_mean_7_coef 400 : La_tqwt_entropy_log_dec_15 500 :
tqwt_medianValue_dec_7
600 : tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| std_delta_delta_log_energy | 0.251 | 0.825 | -0.736 | 0.804 | 0.483 | 0.798 |
| std_delta_log_energy | 0.250 | 0.841 | -0.690 | 0.787 | 0.469 | 0.794 |
| std_9th_delta_delta | 0.347 | 0.952 | -0.611 | 0.674 | 0.766 | 0.787 |
| std_8th_delta_delta | 0.319 | 0.941 | -0.598 | 0.595 | 0.862 | 0.780 |
| std_7th_delta_delta | 0.324 | 0.905 | -0.558 | 0.647 | 0.977 | 0.776 |
| tqwt_entropy_log_dec_12 | -0.147 | 0.876 | 0.764 | 0.876 | 0.676 | 0.770 |
| std_6th_delta_delta | 0.311 | 0.851 | -0.470 | 0.540 | 0.896 | 0.768 |
| std_8th_delta | 0.310 | 0.950 | -0.587 | 0.637 | 0.971 | 0.767 |
| std_9th_delta | 0.306 | 0.885 | -0.519 | 0.660 | 0.330 | 0.764 |
| tqwt_entropy_shannon_dec_12 | -0.282 | 0.940 | 0.593 | 0.833 | 0.145 | 0.763 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| std_delta_delta_log_energy | 0.2506 | 0.825 | -0.736 | 0.804 | 4.83e-01 | 0.798 |
| std_8th_delta_delta | 0.3190 | 0.941 | -0.598 | 0.595 | 8.62e-01 | 0.780 |
| tqwt_entropy_shannon_dec_12 | -0.2819 | 0.940 | 0.593 | 0.833 | 1.45e-01 | 0.763 |
| La_tqwt_entropy_shannon_dec_33 | -0.0534 | 0.230 | 0.197 | 0.357 | 3.85e-01 | 0.759 |
| mean_MFCC_2nd_coef | -0.3598 | 1.433 | -1.933 | 1.997 | 2.87e-06 | 0.753 |
| La_tqwt_entropy_shannon_dec_20 | -0.1988 | 0.533 | 0.093 | 0.263 | 4.87e-01 | 0.744 |
| La_tqwt_entropy_shannon_dec_17 | -0.1131 | 0.508 | 0.135 | 0.175 | 6.94e-01 | 0.734 |
| La_tqwt_stdValue_dec_32 | 0.0206 | 0.168 | -0.156 | 0.273 | 2.33e-01 | 0.734 |
| La_tqwt_kurtosisValue_dec_33 | 0.0505 | 0.377 | -0.301 | 0.511 | 1.54e-01 | 0.732 |
| tqwt_energy_dec_27 | 0.1691 | 0.714 | -0.428 | 0.771 | 4.74e-02 | 0.725 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.49 | 481 | 0.646 |
allSigvars <- names(dc)
dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
coef <- theFormulas[[dx]]
cname <- names(theFormulas[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("%5.3f*%s",coef[cf],cname[cf]))
}
}
}
}
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| std_delta_delta_log_energy | 0.2506 | 0.825 | -0.73605 | 0.804 | 4.83e-01 | 0.798 | 0.798 | 2 | |
| std_8th_delta_delta | 0.3190 | 0.941 | -0.59810 | 0.595 | 8.62e-01 | 0.780 | 0.780 | 6 | |
| tqwt_entropy_shannon_dec_12 | -0.2819 | 0.940 | 0.59251 | 0.833 | 1.45e-01 | 0.763 | 0.763 | 9 | |
| La_tqwt_entropy_shannon_dec_33 | -0.868tqwt_entropy_shannon_dec_31 + 1.000tqwt_entropy_shannon_dec_33 | -0.0534 | 0.230 | 0.19681 | 0.357 | 3.85e-01 | 0.759 | 0.621 | 0 |
| mean_MFCC_2nd_coef | -0.3598 | 1.433 | -1.93344 | 1.997 | 2.87e-06 | 0.753 | 0.753 | NA | |
| La_tqwt_entropy_shannon_dec_20 | + 1.000tqwt_entropy_shannon_dec_20 + 0.886tqwt_minValue_dec_20 | -0.1988 | 0.533 | 0.09299 | 0.263 | 4.87e-01 | 0.744 | 0.622 | -1 |
| La_tqwt_entropy_shannon_dec_17 | + 1.000tqwt_entropy_shannon_dec_17 + 0.990tqwt_minValue_dec_17 | -0.1131 | 0.508 | 0.13493 | 0.175 | 6.94e-01 | 0.734 | 0.709 | -1 |
| La_tqwt_stdValue_dec_32 | + 1.000tqwt_stdValue_dec_32 -1.002tqwt_stdValue_dec_33 | 0.0206 | 0.168 | -0.15556 | 0.273 | 2.33e-01 | 0.734 | 0.573 | 4 |
| La_tqwt_kurtosisValue_dec_33 | -0.809tqwt_kurtosisValue_dec_31 + 1.000tqwt_kurtosisValue_dec_33 | 0.0505 | 0.377 | -0.30099 | 0.511 | 1.54e-01 | 0.732 | 0.628 | -1 |
| tqwt_energy_dec_27 | 0.1691 | 0.714 | -0.42764 | 0.771 | 4.74e-02 | 0.725 | 0.725 | 1 | |
| tqwt_entropy_shannon_dec_17 | NA | -0.5108 | 1.187 | 0.25698 | 0.652 | 7.06e-03 | 0.709 | 0.709 | NA |
| tqwt_minValue_dec_17 | NA | 0.4016 | 1.110 | -0.12323 | 0.652 | 2.66e-02 | 0.636 | 0.636 | 14 |
| tqwt_kurtosisValue_dec_33 | NA | 0.2156 | 0.824 | -0.11644 | 0.756 | 5.93e-02 | 0.628 | 0.628 | NA |
| tqwt_stdValue_dec_33 | NA | -0.0105 | 0.884 | 0.43941 | 0.903 | 4.60e-01 | 0.628 | 0.628 | NA |
| tqwt_entropy_shannon_dec_20 | NA | -0.4085 | 1.117 | 0.08982 | 0.678 | 2.47e-01 | 0.622 | 0.622 | NA |
| tqwt_entropy_shannon_dec_33 | NA | -0.0183 | 0.868 | 0.32633 | 0.970 | 4.68e-01 | 0.621 | 0.621 | NA |
| tqwt_stdValue_dec_32 | NA | 0.0102 | 0.935 | 0.28472 | 0.911 | 2.65e-01 | 0.573 | 0.573 | NA |
| tqwt_minValue_dec_20 | NA | 0.2365 | 1.116 | 0.00358 | 0.744 | 4.02e-01 | 0.544 | 0.544 | 9 |
| tqwt_entropy_shannon_dec_31 | NA | 0.0405 | 0.977 | 0.14928 | 1.028 | 3.93e-01 | 0.537 | 0.537 | NA |
| tqwt_kurtosisValue_dec_31 | NA | 0.2041 | 0.837 | 0.22819 | 0.904 | 1.27e-01 | 0.490 | 0.490 | 3 |